# Scrape British Airways customer reviews from Skytrax (airlinequality.com).
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10        # number of paginated result pages to fetch
page_size = 100   # reviews requested per page
reviews = []      # accumulates the raw text of every review

for i in range(1, pages + 1):
    print(f"Scraping page {i}")
    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    # Collect HTML data from this page; time out rather than hang forever,
    # and fail loudly on HTTP errors instead of silently parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Parse content and pull out each review body.
    parsed_content = BeautifulSoup(response.content, "html.parser")
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    print(f"   ---> {len(reviews)} total reviews")
Scraping page 1 ---> 100 total reviews Scraping page 2 ---> 200 total reviews Scraping page 3 ---> 300 total reviews Scraping page 4 ---> 400 total reviews Scraping page 5 ---> 500 total reviews Scraping page 6 ---> 600 total reviews Scraping page 7 ---> 700 total reviews Scraping page 8 ---> 800 total reviews Scraping page 9 ---> 900 total reviews Scraping page 10 ---> 1000 total reviews
# Collect the scraped review texts into a single-column DataFrame.
df = pd.DataFrame({"reviews": reviews})
df.head()
| reviews | |
|---|---|
| 0 | ✅ Trip Verified | Prior to boarding a gate a... |
| 1 | ✅ Trip Verified | I flew from Amsterdam to L... |
| 2 | ✅ Trip Verified | First the good news, the clu... |
| 3 | ✅ Trip Verified | I have never travelled wit... |
| 4 | ✅ Trip Verified | Terrible overall, medium ser... |
# saving data into CSV; create the target directory first so to_csv does
# not raise FileNotFoundError on a fresh checkout (os is imported here
# because the top-level import appears later in the file).
import os
os.makedirs("data", exist_ok=True)
df.to_csv("data/BA_reviews.csv")
# Analysis-stage imports (the original cell imported pandas three times and
# os twice; collapsed to one import each, grouped stdlib / third-party).
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Get the current working directory
cwd = os.getcwd()
print(f"Current working directory: {cwd}")  # Debug: Check where you are

# Define the correct path to your CSV file.
# NOTE(review): hard-coded, user-specific path — prefer a path built from cwd
# so the notebook runs on other machines.
file_path = "C:/Users/sangr/data/BA_reviews.csv"

# Read the CSV file into a DataFrame (first column is the saved index)
df = pd.read_csv(file_path, index_col=0)

# Display the first few rows to verify
print(df.head())
Current working directory: C:\Users\sangr
reviews
0 ✅ Trip Verified | Prior to boarding a gate a...
1 ✅ Trip Verified | I flew from Amsterdam to L...
2 ✅ Trip Verified | First the good news, the clu...
3 ✅ Trip Verified | I have never travelled wit...
4 ✅ Trip Verified | Terrible overall, medium ser...
# will also create a column which mentions if the user is verified or not.
# True when the review text contains the Skytrax "Trip Verified" badge.
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']
0 True
1 True
2 True
3 True
4 True
...
995 True
996 False
997 True
998 True
999 True
Name: verified, Length: 1000, dtype: bool
import nltk

# Fetch the NLTK data needed below: WordNet for lemmatisation, the stop-word
# list, and the punkt tokenizer models.
for resource in ("wordnet", "stopwords", "punkt"):
    nltk.download(resource)
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\sangr\AppData\Roaming\nltk_data... [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\sangr\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\stopwords.zip. [nltk_data] Downloading package punkt to [nltk_data] C:\Users\sangr\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
True
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# BUG FIX: str.strip(chars) removes any of the given CHARACTERS from both
# ends, not the prefix string — it was eating the start of real words
# (e.g. "Terrible overall..." became "ble overall..."). Strip the
# verification-badge prefix with an anchored regex instead.
reviews_data = df.reviews.str.replace(
    r"^(✅ Trip Verified|Not Verified)\s*\|\s*", "", regex=True
)

# Hoist the stop-word set out of the loop: rebuilding it per review was
# needless O(reviews × stopwords) work.
stop_words = set(stopwords.words("english"))

# create an empty list to collect cleaned data corpus
corpus = []
# loop through each review: keep letters only, lowercase, drop stop words,
# lemmatize, re-join, and add to the corpus
for rev in reviews_data:
    rev = re.sub("[^a-zA-Z]", " ", rev)
    rev = rev.lower()
    tokens = [lemma.lemmatize(word) for word in rev.split() if word not in stop_words]
    corpus.append(" ".join(tokens))

# add the corpus to the original dataframe
df['corpus'] = corpus
df.head()
| reviews | verified | corpus | |
|---|---|---|---|
| 0 | ✅ Trip Verified | Prior to boarding a gate a... | True | prior boarding gate agent seemed pick elderly ... |
| 1 | ✅ Trip Verified | I flew from Amsterdam to L... | True | flew amsterdam la vega layover heathrow novemb... |
| 2 | ✅ Trip Verified | First the good news, the clu... | True | first good news club suite huge improvement ol... |
| 3 | ✅ Trip Verified | I have never travelled wit... | True | never travelled british airway first time chos... |
| 4 | ✅ Trip Verified | Terrible overall, medium ser... | True | ble overall medium service flight delayed help... |
# Cleaning/format: inspect the column dtypes before any conversion.
df.dtypes
reviews object verified bool corpus object dtype: object
#Check for null Values
# value_counts over the boolean null-mask: one row per null pattern,
# so a single row of all-False means no missing values anywhere.
df.isnull().value_counts()
reviews verified corpus False False False 1000 Name: count, dtype: int64
# (rows, columns) sanity check.
df.shape
(1000, 3)
#resetting the index
# BUG FIX: reset_index returns a new DataFrame; the original discarded the
# result, leaving df's index unchanged. Assign it back (the trailing bare
# `df` preserves the notebook's displayed output).
df = df.reset_index(drop=True)
df
| reviews | verified | corpus | |
|---|---|---|---|
| 0 | ✅ Trip Verified | Prior to boarding a gate a... | True | prior boarding gate agent seemed pick elderly ... |
| 1 | ✅ Trip Verified | I flew from Amsterdam to L... | True | flew amsterdam la vega layover heathrow novemb... |
| 2 | ✅ Trip Verified | First the good news, the clu... | True | first good news club suite huge improvement ol... |
| 3 | ✅ Trip Verified | I have never travelled wit... | True | never travelled british airway first time chos... |
| 4 | ✅ Trip Verified | Terrible overall, medium ser... | True | ble overall medium service flight delayed help... |
| ... | ... | ... | ... |
| 995 | ✅ Trip Verified | I have to say travelling in ... | True | say travelling club europe waste money food be... |
| 996 | Not Verified | I had a stress free journey wi... | False | verified stress free journey yr old autistic s... |
| 997 | ✅ Trip Verified | Edinburgh to Kuala Lumpur v... | True | edinburgh kuala lumpur via london returned kl ... |
| 998 | ✅ Trip Verified | I was supposed to fly from ... | True | supposed fly london city amsterdam business cl... |
| 999 | ✅ Trip Verified | I purchased a ticket for Du... | True | purchased ticket dublin mauritius british airw... |
1000 rows × 3 columns
# export the cleaned data to <cwd>/cleaned-BA-reviews.csv
output_path = cwd + "/cleaned-BA-reviews.csv"
df.to_csv(output_path)
# Visualisation / analysis imports.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime as dt
from wordcloud import WordCloud, STOPWORDS

# Reload the cleaned dataset from the working directory.
cwd = os.getcwd()
df = pd.read_csv(cwd + "/cleaned-BA-reviews.csv", index_col=0)

# Make sure the row index is a clean 0..n-1 range before plotting.
df = df.reset_index(drop=True)
df.head()
| reviews | verified | corpus | |
|---|---|---|---|
| 0 | ✅ Trip Verified | Prior to boarding a gate a... | True | prior boarding gate agent seemed pick elderly ... |
| 1 | ✅ Trip Verified | I flew from Amsterdam to L... | True | flew amsterdam la vega layover heathrow novemb... |
| 2 | ✅ Trip Verified | First the good news, the clu... | True | first good news club suite huge improvement ol... |
| 3 | ✅ Trip Verified | I have never travelled wit... | True | never travelled british airway first time chos... |
| 4 | ✅ Trip Verified | Terrible overall, medium ser... | True | ble overall medium service flight delayed help... |
#What is the average overall rating given for British Airways?
# NOTE(review): this loads a DIFFERENT file (from Downloads, 3411 rows with
# extra stars/date/country columns) than the 1000-row file exported above —
# confirm this switch of dataset is intentional.
# Load your file (replace 'your_file.csv' with your actual file name)
data = pd.read_csv("C:/Users/sangr/Downloads/cleaned-BA-reviews.csv")
# 1. Basic Overview
print("Dataset Shape:", data.shape) # Number of rows and columns
print("\nDataset Info:")
print(data.info()) # Data types and missing values
print("\nFirst 5 Rows:")
print(data.head()) # Preview the data
Dataset Shape: (3411, 7)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 3411 non-null int64
1 reviews 3411 non-null object
2 stars 3411 non-null int64
3 date 3411 non-null object
4 country 3411 non-null object
5 verified 3411 non-null bool
6 corpus 3411 non-null object
dtypes: bool(1), int64(2), object(4)
memory usage: 163.4+ KB
None
First 5 Rows:
Unnamed: 0 reviews stars \
0 0 Not Verified | Worst experience ever. Outbound... 5
1 1 ✅ Trip Verified | Check in was a shambles at ... 1
2 2 ✅ Trip Verified | Beyond disgusted with the fa... 5
3 3 ✅ Trip Verified | On July 19th 2022 I had subm... 1
4 4 ✅ Trip Verified | I booked the flight on Oct ... 1
date country verified \
0 2022-11-07 Italy False
1 2022-11-07 Malaysia True
2 2022-11-05 United Arab Emirates True
3 2022-10-31 United States True
4 2022-10-31 United States True
corpus
0 verified worst experience ever outbound flight...
1 check shamble bwi counter open full flight bag...
2 beyond disgusted fact baggage yet delivered we...
3 july th submitted complaint form regard fact b...
4 booked flight oct cancel flight day learning g...
# 2. Summary Statistics (describe() covers numeric columns only by default)
print("\nSummary Statistics:")
print(data.describe()) # Mean, median, std, etc. for numeric columns
Summary Statistics:
Unnamed: 0 stars
count 3411.000000 3411.000000
mean 1705.766051 4.841102
std 985.966935 3.144230
min 0.000000 1.000000
25% 852.500000 2.000000
50% 1705.000000 4.000000
75% 2557.500000 8.000000
max 3417.000000 10.000000
# 3. Check for Missing Values — per-column count of nulls
print("\nMissing Values:")
print(data.isnull().sum())
Missing Values: Unnamed: 0 0 reviews 0 stars 0 date 0 country 0 verified 0 corpus 0 dtype: int64
# 4. Visualizations
# Histograms for every numeric column.
data.hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()

# Correlation heatmap over the numeric columns only.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
if numeric_data.empty:
    print("No numeric columns available for correlation heatmap.")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()
# Columns of the earlier 3-column dataframe, for comparison with the reload below.
print(df.columns)
Index(['reviews', 'verified', 'corpus'], dtype='object')
# Reload the richer Downloads CSV into df (7 columns incl. stars/date/country).
df = pd.read_csv("C:/Users/sangr/Downloads/cleaned-BA-reviews.csv", encoding="utf-8", low_memory=False)
print(df.columns)
Index(['Unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
'corpus'],
dtype='object')
# Ensure column names are stripped of any spaces
df.columns = df.columns.str.strip()
# Convert 'stars' column to numeric; unparseable values become NaN ('coerce')
df['stars'] = pd.to_numeric(df['stars'], errors='coerce')
# Calculate the average rating (mean() skips NaN by default)
average_rating = df['stars'].mean()
print("Average Rating:", average_rating)
Average Rating: 4.841102316036353
# Bar chart of how many reviews gave each star rating.
df.stars.value_counts().plot(kind="bar")
plt.xlabel("Ratings")
plt.ylabel("Total Number of reviews with that rating")
plt.suptitle("Counts for each ratings")
Text(0.5, 0.98, 'Counts for each ratings')
#resetting index as we do not want to confuse between the index and the rating values
# BUG FIX: df_ratings was used here before ever being created, and the
# rename keyed on 'index' is a no-op in pandas >= 2.0 (reset_index already
# names the rating column 'stars'), leaving mislabelled columns. Build the
# counts table explicitly with unambiguous names instead.
df_ratings = df.stars.value_counts().reset_index()
df_ratings.columns = ['stars', 'total_counts']
df_ratings
| total_counts | count | |
|---|---|---|
| 0 | 1 | 735 |
| 1 | 2 | 382 |
| 2 | 3 | 379 |
| 3 | 8 | 349 |
| 4 | 10 | 306 |
| 5 | 7 | 299 |
| 6 | 9 | 293 |
| 7 | 5 | 259 |
| 8 | 4 | 227 |
| 9 | 6 | 182 |
df_ratings.columns = df_ratings.columns.str.strip().str.lower() # Standardize column names
print(df_ratings.columns)
Index(['stars', 'total_counts'], dtype='object')
# Highlight the most common rating in red, everything else grey.
# Hoisted max() out of the comprehension — it was recomputed for every
# element, making the colour list O(n^2).
peak = df_ratings.total_counts.max()
clrs = ['Red' if x == peak else 'grey' for x in df_ratings.total_counts]
# NOTE(review): errwidth= is deprecated in seaborn >= 0.13 (use err_kws);
# kept here for compatibility with the installed seaborn version.
ax = sns.barplot(x=df_ratings.stars, y=df_ratings.total_counts, data=df_ratings, errwidth=0,
                 palette=clrs)
ax.bar_label(ax.containers[0])  # annotate each bar with its count
ax.set_xlabel("Ratings")
ax.set_ylabel("Total Number of reviews with that rating")
ax.set_title("Counts for each ratings")
# Unique countries BA received reviews from.
print(f"{len(df.country.unique())} unique countries")
69 unique countries
# Which country do most reviews come from? (top 5)
# BUG FIX: in pandas >= 2.0, value_counts().reset_index() yields columns
# ['country', 'count'], so the old rename (keyed on 'index') mislabelled
# the country column as 'total_reviews'. Name the columns explicitly.
df_country_review = df.country.value_counts().head().reset_index()
df_country_review.columns = ['country', 'total_reviews']
print(df.columns)
Index(['Unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
'corpus'],
dtype='object')
# Recompute review counts per country via groupby (one row per country).
df_country_review = df.groupby("country").size().reset_index(name="total_reviews")
# Normalise all dataframe column names to stripped lower-case.
df.columns = df.columns.str.strip().str.lower()
print(df.columns) # Check again
Index(['unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
'corpus'],
dtype='object')
import pandas as pd
import matplotlib.pyplot as plt

# Pie chart of the five countries contributing the most reviews.
# Check if 'country' exists
if 'country' in df.columns:
    # Count number of reviews per country
    df_country_review = df.groupby("country").size().reset_index(name="total_reviews")
    # Get top 5 countries with most reviews
    top_countries = df_country_review.sort_values(by="total_reviews", ascending=False).head(5)
    # Define vibrant colors
    vibrant_colors = ['#FF355E', '#FD5B78', '#FF6037', '#FFCC33', '#66FF66'] # Cherry Red, Coral, Orange, Yellow, Lime Green
    # Plot the pie chart
    plt.figure(figsize=(8, 6))
    plt.pie(top_countries["total_reviews"],
            labels=top_countries["country"],
            autopct='%1.1f%%',
            colors=vibrant_colors,
            startangle=140,
            wedgeprops={'edgecolor': 'white', 'linewidth': 2},
            textprops={'fontsize': 12, 'color': 'black'})
    plt.title("Top 5 Countries by Number of Reviews",
              fontsize=16,
              color='#FF355E',
              pad=20)
    # Add a slight shadow effect
    # NOTE(review): this draws a second, smaller semi-transparent pie on top
    # of the first one — confirm the layered look is intentional.
    plt.pie(top_countries["total_reviews"],
            labels=None,
            colors=vibrant_colors,
            startangle=140,
            radius=0.85,
            wedgeprops={'edgecolor': 'gray', 'linewidth': 1, 'alpha': 0.3})
    plt.show()
else:
    print("❌ 'country' column not found in the dataset!")
# Which country provided on average highest ratings?
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the ten countries with the highest mean star rating.
# Check if 'country' and 'stars' exist
if 'country' in df.columns and 'stars' in df.columns:
    # Ensure 'stars' is numeric (convert if needed)
    df['stars'] = pd.to_numeric(df['stars'], errors='coerce') # Converts non-numeric to NaN
    # Group by country and calculate mean stars, then sort and reset index
    df_country_rating = pd.DataFrame(
        df.groupby('country')['stars'].mean().sort_values(ascending=False)
    ).reset_index()
    # Get top 10 countries by average rating
    top_countries = df_country_rating.head(10)
    # Define vibrant colors (one per bar)
    vibrant_colors = ['#FF355E', '#FD5B78', '#FF6037', '#FFCC33', '#66FF66',
                      '#00CCCC', '#FF00FF', '#FF6F61', '#6B5B95', '#88B04B'] # Added Cyan, Magenta, Coral, Purple, Olive Green
    # Plot the bar chart (wider figure to accommodate 10 bars)
    plt.figure(figsize=(12, 6))
    bars = plt.bar(top_countries['country'],
                   top_countries['stars'],
                   color=vibrant_colors,
                   edgecolor='white', # White borders for contrast
                   linewidth=2) # Border thickness
    # Customize the chart
    plt.title("Top 10 Countries by Average Rating",
              fontsize=16,
              color='#FF355E',
              pad=20)
    plt.xlabel("Country", fontsize=12, color='black')
    plt.ylabel("Average Rating", fontsize=12, color='black')
    # Add value labels on top of each bar
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.05,
                 f'{yval:.1f}', # Display value with 1 decimal
                 ha='center', va='bottom', fontsize=10, color='black')
    # Adjust layout and grid
    plt.grid(axis='y', linestyle='--', alpha=0.7) # Light grid on y-axis
    plt.xticks(rotation=45, ha='right') # Rotate country names for better fit
    plt.tight_layout()
    plt.show()
else:
    print("❌ Required columns ('country' or 'stars') not found in the dataset!")
Time Series Analysis
#convert the date datatype to datetime so plotly can build a proper time axis
df.date = pd.to_datetime(df.date)
# Interactive line chart of star ratings over time, with a range slider.
fig = px.line(df, x='date', y="stars")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
import nltk
from nltk.corpus import stopwords

# Concatenate every cleaned review into one blob for the word cloud.
reviews = " ".join(df.corpus)
plt.figure(figsize=(10,10))
# NOTE: this rebinds the name `stopwords` from the module just imported to a
# plain set of English stop words; the module is not needed again in this cell.
stopwords = set(stopwords.words('english'))
# Create and generate a word cloud image:
wordcloud = WordCloud(height=600,width=600,max_font_size=100, max_words=500, stopwords=stopwords).generate(reviews)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
import nltk
from nltk.corpus import stopwords

# Second word cloud with airline-specific boilerplate terms removed.
reviews = " ".join(df.corpus)
plt.figure(figsize=(20,10))
stopwords = set(stopwords.words('english'))
# BUG FIX: the original list was missing a comma after "passenger", so
# Python concatenated the adjacent string literals into "passengerlondon"
# and neither "passenger" nor "london" was actually excluded.
stopwords.update(["ba", "flight", "british", "airway", "airline", "plane", "told", "also",
                  "passenger", "london", "heathrow", "aircraft", "could", "even", "would"])
# Create and generate a word cloud image:
wordcloud = WordCloud(height=500,width=500,max_font_size=100, max_words=300, stopwords=stopwords).generate(reviews)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

#split the text of all reviews into a list of words
words = reviews.split(" ")
#remove certain words that will not be used to determine the positive or negative sentiment
# NOTE: this rebinds `stopwords` to a frozenset (sklearn's English list plus
# custom airline terms), replacing the earlier nltk-derived set.
stopwords = text.ENGLISH_STOP_WORDS.union(['flight', 'ba', "passenger","u", "london","airway","british","airline",\
"heathrow","plane","lhr","review"])
new_words = [word for word in words if word not in stopwords]
# 20 most frequent remaining words
nlp_words=FreqDist(new_words).most_common(20)
#create a dataframe of these word and its frequencies
all_fdist = pd.Series(dict(nlp_words))
## Setting figure, ax into variables
fig, ax = plt.subplots(figsize=(15,8))
## Seaborn plotting using Pandas attributes + xtick rotation for ease of viewing
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
all_plot.bar_label(all_plot.containers[0])
plt.xticks(rotation=30)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]),
[Text(0, 0, 'seat'),
Text(1, 0, 'service'),
Text(2, 0, 'food'),
Text(3, 0, 'time'),
Text(4, 0, 'crew'),
Text(5, 0, 'cabin'),
Text(6, 0, 'good'),
Text(7, 0, 'class'),
Text(8, 0, 'hour'),
Text(9, 0, 'business'),
Text(10, 0, 'staff'),
Text(11, 0, 'economy'),
Text(12, 0, 'check'),
Text(13, 0, 'drink'),
Text(14, 0, 'meal'),
Text(15, 0, 'return'),
Text(16, 0, 'lounge'),
Text(17, 0, 'club'),
Text(18, 0, 'boarding'),
Text(19, 0, 'experience')])
This gives us a glimpse of what customers are really talking about. "Seat" is the most-discussed aspect of the airline, followed by "service" and "food" — all central to the customer experience. However, we still do not know how customers feel about each of these aspects. To add significance to these terms, we will use n-gram plots to see whether the associated experiences are good or bad.
## Imports
import nltk.collocations as collocations
from nltk import FreqDist, bigrams

# Rebuild the word list from the cleaned corpus, filtered against the
# current `stopwords` set (the sklearn-based one defined above).
reviews = " ".join(df.corpus)
#split the text of all reviews into a list of words
words = reviews.split(" ")
new_words = [word for word in words if word not in stopwords]
def get_freq_dist(new_words, number_of_ngrams):
    """Plot a horizontal bar chart of the 40 most common n-grams.

    Parameters
    ----------
    new_words : list[str]
        Pre-filtered tokens (stop words already removed).
    number_of_ngrams : int
        Size of the n-grams to count (e.g. 4 for 4-grams).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the bar chart was drawn on.
    """
    from nltk import ngrams

    ## Generate n-grams (renamed local so it no longer shadows the import)
    grams = ngrams(new_words, number_of_ngrams)
    ## Creating FreqDist: the 40 most common n-grams with their counts
    ngram_fd = FreqDist(grams).most_common(40)
    ## Join n-gram tokens with '_', sorted ascending by frequency so the
    ## largest bar ends up at the top of the barh chart.
    ## (The original also built a tuple-keyed dict that was never used;
    ## that dead code was removed.)
    ngram_joined = {'_'.join(k): v for k, v in sorted(ngram_fd, key=lambda item: item[1])}
    ## Convert to Pandas series for easy plotting
    ngram_freqdist = pd.Series(ngram_joined)
    plt.figure(figsize=(10, 10))
    ax = ngram_freqdist.plot(kind="barh")
    return ax

get_freq_dist(new_words, 4)
<Axes: >
We can see several common positive phrases about the cabin crew — for example cabin_crew_friendly_helpful, cabin_crew_friendly_attentive, and cabin_crew_friendly_efficient — so customers are clearly reviewing British Airways' cabin crew staff favourably.
However, there is another approach to word frequencies that will give us a better idea: grouping the reviews by rating. We assume ratings 1-3 indicate a bad experience, 4-6 an average/good experience, and 7-10 a great experience.
# Bucket the reviews by star rating: 1-3 bad, 4-6 average, 7-10 great.
ratings_1_3 = df[df.stars.isin([1,2,3])]
ratings_4_6 = df[df.stars.isin([4,5,6])]
ratings_7_10 = df[df.stars.isin([7,8,9,10])]
# One big text blob per bucket.
reviews_1_3 = " ".join(ratings_1_3.corpus)
reviews_4_6 = " ".join(ratings_4_6.corpus)
reviews_7_10 = " ".join(ratings_7_10.corpus)
#split the text of all reviews into a list of words
words_1_3 = reviews_1_3.split(" ")
words_4_6 = reviews_4_6.split(" ")
words_7_10 = reviews_7_10.split(" ")
# 4-gram frequencies for the high-rating (7-10) bucket.
new_words_7_10 = [word for word in words_7_10 if word not in stopwords]
get_freq_dist(new_words_7_10,4)
<Axes: >
# 4-gram frequencies for the middling (4-6 star) bucket.
new_words = [word for word in words_4_6 if word not in stopwords]
get_freq_dist(new_words,4)
<Axes: >
# 4-gram frequencies for the low-rating (1-3 star) bucket.
new_words = [word for word in words_1_3 if word not in stopwords]
get_freq_dist(new_words,4)
<Axes: >
!pip install textblob
Collecting textblob
Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/1e/d6/40aa5aead775582ea0cf35870e5a3f16fab4b967f1ad2debe675f673f923/textblob-0.19.0-py3-none-any.whl.metadata
Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
Obtaining dependency information for nltk>=3.9 from https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl.metadata
Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: click in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (8.0.4)
Requirement already satisfied: joblib in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (2022.7.9)
Requirement already satisfied: tqdm in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (4.65.0)
Requirement already satisfied: colorama in c:\users\sangr\anaconda3\lib\site-packages (from click->nltk>=3.9->textblob) (0.4.6)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
---------------------------------------- 0.0/624.3 kB ? eta -:--:--
--------------------------------------- 10.2/624.3 kB ? eta -:--:--
--- ----------------------------------- 61.4/624.3 kB 812.7 kB/s eta 0:00:01
---------------- ----------------------- 256.0/624.3 kB 2.3 MB/s eta 0:00:01
--------------------------------------- 614.4/624.3 kB 4.3 MB/s eta 0:00:01
---------------------------------------- 624.3/624.3 kB 3.6 MB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
---------------------------------------- 0.0/1.5 MB ? eta -:--:--
-------------------------- ------------- 1.0/1.5 MB 31.7 MB/s eta 0:00:01
---------------------------------------- 1.5/1.5 MB 19.2 MB/s eta 0:00:00
Installing collected packages: nltk, textblob
Attempting uninstall: nltk
Found existing installation: nltk 3.8.1
Uninstalling nltk-3.8.1:
Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1 textblob-0.19.0
%%capture
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
#set a column Polarity with all 0 values initially
df['polarity'] = 0
for i in range(len(df.corpus)):
sent= TextBlob(df.corpus[i])
polarity = sent.sentiment.polarity
subjectivity = sent.sentiment.subjectivity
df['polarity'][i] = polarity
# let's see how many texts are with positive comments
print(f"{df[(df['polarity'] >-0.2) & (df['polarity'] <0.2)].shape[0]} number of reviews between -0.2 and 0.2 polarity score")
print(f"{df[(df['polarity'] >-0.1) & (df['polarity'] <0.1)].shape[0]} number of reviews between -0.1 and 0.1 polarity score")
2286 number of reviews between -0.2 and 0.2 polarity score 1319 number of reviews between -0.1 and 0.1 polarity score
Polarity scores range from -1 to 1: the closer the value is to -1, the more negative the review, and the closer to 1, the more positive. If we take any review with polarity greater than 0.2 as positive and less than -0.2 as negative, we are left with 2286 reviews in the neutral zone. To further narrow down this number of neutral reviews, let's tighten the threshold to 0.1.
We will try another method of labelling the reviews as positive or negative: the VADER algorithm from the nltk library.
%%capture
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vds = SentimentIntensityAnalyzer()
# text = 'This is an extremely entertaining movie'
#set a column Polarity with all 0 values initially
df['label'] = 0
for i in range(len(df.corpus)):
score = vds.polarity_scores(df.corpus[i])['compound']
#print(score)
if score > 0.2:
df['label'][i] = 1
#print("1st")
elif score < 0:
df['label'][i] = -1
#print("2nd")
else:
df['label'][i] = 0
df.label.value_counts()
label 1 2245 -1 1049 0 117 Name: count, dtype: int64
# BUG FIX: CountVectorizer was used here before any visible import (the
# import only appears in the following cell) — presumably it worked only
# because of an earlier hidden cell. Import it explicitly.
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the cleaned `corpus` list built in the cleaning step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()
%%capture
from sklearn.feature_extraction.text import CountVectorizer

#create an object of count vectorizer
vect = CountVectorizer()
#apply transformation: dense document-term matrix of raw token counts
tf = vect.fit_transform(df.corpus).toarray()
# get the feature names with the updated method
tf_feature_names = vect.get_feature_names_out()
from sklearn.decomposition import LatentDirichletAllocation

#declare the number of topics
number_of_topics = 8
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
#fit the term frequency data to the model
model.fit(tf)

#create empty dictionary to store key value pair of topic number and its weights
topic_dict = {}
#loop through model components: for each topic, take the 10 highest-weighted
#terms (argsort is ascending, so slice from the end) and their weights
for topic_idx, topic in enumerate(model.components_):
    topic_dict["Topic %d words" % (topic_idx)] = ['{}'.format(tf_feature_names[i])
                                                  for i in topic.argsort()[:-10 - 1:-1]]
    topic_dict["Topic %d weights" % (topic_idx)] = ['{:.1f}'.format(topic[i])
                                                    for i in topic.argsort()[:-10 - 1:-1]]
df_topic = pd.DataFrame(topic_dict)
df_topic
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | Topic 5 words | Topic 5 weights | Topic 6 words | Topic 6 weights | Topic 7 words | Topic 7 weights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | seat | 1174.6 | flight | 1320.1 | flight | 1711.5 | class | 220.7 | seat | 1332.1 | british | 604.6 | flight | 1602.2 | ba | 554.5 |
| 1 | class | 638.1 | crew | 731.1 | hour | 671.9 | seat | 182.4 | ba | 1065.7 | flight | 582.0 | ba | 1380.5 | flight | 541.8 |
| 2 | business | 612.4 | time | 616.2 | london | 594.6 | business | 159.8 | good | 1051.3 | airway | 582.0 | seat | 795.9 | customer | 520.8 |
| 3 | flight | 357.7 | ba | 569.8 | ba | 471.5 | flight | 63.6 | flight | 1030.1 | food | 567.3 | airline | 490.0 | service | 376.8 |
| 4 | airway | 299.6 | cabin | 551.9 | bag | 459.9 | ba | 59.6 | food | 824.4 | economy | 533.0 | staff | 480.8 | british | 288.7 |
| 5 | british | 290.9 | service | 494.5 | heathrow | 450.0 | passenger | 42.3 | crew | 814.0 | service | 497.2 | one | 390.6 | airway | 285.8 |
| 6 | ba | 284.0 | drink | 437.4 | time | 436.4 | facing | 40.5 | cabin | 685.0 | seat | 462.8 | service | 372.1 | refund | 232.8 |
| 7 | would | 267.9 | good | 361.7 | airway | 406.9 | one | 40.5 | service | 585.5 | london | 425.4 | hour | 328.2 | call | 228.8 |
| 8 | economy | 264.7 | lounge | 350.4 | british | 404.7 | first | 32.7 | lhr | 565.6 | airline | 404.3 | food | 316.7 | day | 219.5 |
| 9 | get | 250.1 | food | 336.5 | check | 344.6 | service | 32.6 | club | 563.9 | meal | 381.2 | london | 299.5 | airline | 212.0 |
from sklearn.decomposition import NMF

# Non-negative matrix factorisation with 2 latent topics over the same
# term-frequency matrix; fit_transform returns per-document topic weights.
nmf = NMF(n_components=2, init='random', random_state=0)
nmf.fit_transform(tf)
array([[0. , 0.07167139],
[0.0715827 , 0.05867791],
[0.02121372, 0.0350061 ],
...,
[0.11298302, 0.15944847],
[0.02491683, 0.03516653],
[0.09953257, 0. ]])
# Same top-terms extraction as the LDA cell, applied to the NMF components.
topic_dict = {}
#loop through model components: 10 highest-weighted terms per topic
for topic_idx, topic in enumerate(nmf.components_):
    topic_dict["Topic %d words" % (topic_idx)] = ['{}'.format(tf_feature_names[i])
                                                  for i in topic.argsort()[:-10 - 1:-1]]
    topic_dict["Topic %d weights" % (topic_idx)] = ['{:.1f}'.format(topic[i])
                                                    for i in topic.argsort()[:-10 - 1:-1]]
df_topic = pd.DataFrame(topic_dict)
df_topic
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | |
|---|---|---|---|---|
| 0 | seat | 24.3 | flight | 22.3 |
| 1 | ba | 12.5 | ba | 7.6 |
| 2 | class | 10.3 | hour | 4.9 |
| 3 | business | 8.0 | time | 4.4 |
| 4 | food | 7.8 | london | 4.0 |
| 5 | cabin | 7.5 | service | 3.8 |
| 6 | service | 7.4 | airway | 3.5 |
| 7 | good | 6.6 | british | 3.5 |
| 8 | crew | 6.5 | would | 3.1 |
| 9 | economy | 6.4 | staff | 2.8 |